Using Flickr API to retrieve XY coordinates on Sport Landmarks in Chicago, IL

In this analysis, I gathered XY locations in the Chicago area based on the location of these known arenas, as well as sport teams that compete there. While this class can extract user information, I am only taking the given location data to map out the results.

In [2]:
import flickrapi
import json
import time
import pandas

Enter your credentials

In [3]:
#Flickr API Key/Secret
API_KEY = 'API KEY'
API_SECRET = 'API SECRET'
In [4]:
# create the API class
flickr = flickrapi.FlickrAPI(API_KEY, API_SECRET, cache=True)

The created python class extracts the information andgathers the first 500 photos based on the user's input. Flickr API only allows 3600 photos to be retrieved every hour, so 500 was used to avoid any time limitations to the analysis.

In [5]:
#PhotoCollector class, where the input has to be the photos from flickr.walk command

class PhotoCollector:
    def __init__(self, flickrwalk):
        #flickrwalk as the only self
        self.flickrwalk = flickrwalk
    # Retrieving photos within 1 km from Wrigley Field in Chicago, IL
    def flickr_walk(self):
        #set a blank list to retrieve the list of photo and user profile id's
        ph_id_list = []
        per_id_list = []
        #for loop retrieving the photo urls from flickr API
        for i, photo_url in enumerate(photos):
            if i > 500: # only collect the first 50 photos
                break
            # get the photo url
            url = photo_url.get('url_c')
            if url is not None:
                # if there is a url to the photo, the for loop will put in the id numbers into both the lists.
                ph_id_list.append(photo_url.get('id'))
                per_id_list.append(photo_url.get('owner'))
        #return the lists so that the values are used in the Photo class.
        return ph_id_list, per_id_list
    
class Photo:
    def __init__(self, ph_id_list, per_id_list):
        self.ph_id_list = ph_id_list
        self.per_id_list = per_id_list
    
    def attributes(self):
        #create lists
        #lists are blank but will be added through the loops.
        uid_list = []
        rname_list = []
        phtitle_list = []
        phdesc_list = []
        phurl_list = []
        lat_list = []
        long_list = []
        tag_list = []
        date_list = []
        regloc_list = []
        count_list = []
        uname_list = []
        loc_list = []
        prof_list = []
        for i in ph_id_list: # looping in the photo information
            #gets info from the individual photo
            photo_info = flickr.photos.getInfo(photo_id= i, format='json')
            #helps decode to a string.
            photo_info_decode = photo_info.decode()
            #load json
            photo_info_load = json.loads(photo_info_decode)
            # USERNAME attribute
            uname = photo_info_load["photo"]["owner"]["username"]
            uname_list.append(uname)
            # REALNAME attribute
            rname = photo_info_load["photo"]["owner"]["realname"]
            rname_list.append(rname)
            # PHOTO TITLE attribute
            phtitle = photo_info_load["photo"]["title"]["_content"]
            phtitle_list.append(phtitle)
            # PHOTO DESCRIPTION attribute
            phdesc = photo_info_load["photo"]["description"]["_content"]
            phdesc_list.append(phdesc)
            # PHOTO URL attribute
            phurl = photo_info_load["photo"]["urls"]["url"]
            for u in phurl: # url is stored as a dict nested in a list [{}], this loops digs it out and appends it to the list.
                for k,item in u.items():
                    if k == "_content":
                        phurl_list.append(item)
            # PHOTO COORDINATES attribute
            ph_lat = photo_info_load["photo"]["location"]["latitude"]
            lat_list.append(ph_lat)
            ph_long = photo_info_load["photo"]["location"]["longitude"]
            long_list.append(ph_long)
            # PHOTO TAGS attribute
            tags = photo_info_load["photo"]["tags"]["tag"]
            raw_tag = []
            for t in tags: # tags is stored as a dict nested in a list [{}], this loops digs it out and appends it to the list.
                for k,item in t.items():
                    if k == "raw":
                        raw_tag.append(item)
            tag_list.append(raw_tag)
            # PHOTO TAKEN TIME attribute
            date = photo_info_load["photo"]["dates"]["taken"]
            date_list.append(date)
            # PHOTO REGISTERED LOCATION attribute
            regloc = photo_info_load["photo"]["owner"]["location"]
            regloc_list.append(regloc)
        for i in per_id_list: #person information - profile
            #gets information from the person's profile
            people_info = flickr.people.getInfo(user_id = i, format = 'json')
            #helps decode to a string
            people_info_decode = people_info.decode()
            #bring to json
            people_info_load = json.loads(people_info_decode)
            # PHOTOS COUNT attribute
            count = people_info_load["person"]["photos"]["count"]["_content"]
            count_list.append(count)
            # USER LOCATION attribute
            loc= people_info_load["person"]
            if "location" in loc: #some profile's do not share their location information, this queries if the user has one or not.
                loc = people_info_load["person"]["location"]["_content"] #if they have location information, it is added
                loc_list.append(loc)
            else:
                loc = "" #if they do not have location information, it will be left blank.
                loc_list.append(loc)
            # PROFILE LINK attribute
            prof = people_info_load["person"]["profileurl"]["_content"]
            prof_list.append(prof)
        #using pandas to create a data frame that can be converted to a csv
        #creating a dictionary where the key stores the values from the lists that were appended through the loops.
        df = pandas.DataFrame(data={"Photo ID" : self.ph_id_list, "Person ID" : self.per_id_list, "Username" : uname_list, "Realname" : rname_list, "Photo Title" : phtitle_list, "Photo Description" : phdesc_list, "Photo URL" : phurl_list, "Latitude" : lat_list, 
                                   "Longitude" : long_list, "Date/Time Taken": date_list, "Photo Tags" : tag_list, "Photo Registered Location" : regloc_list, "Profile Living Location" : loc_list,
                                    "Profile URL" : prof_list, "Photos Count" : count_list})
        #this will take away the brackets when brought to csv
        df["Photo Tags"] = df["Photo Tags"].apply(" , ".join)
        print("\nTotal points found: ",len(df.index))
        print ("\nCollection of Flickr photos has been completed.")
        return df

Chicago Bulls at the United Center

In [6]:
# photos is the main variable used with the criteria wanted.
photos = flickr.walk(lat=41.8806908, lon=-87.6763646, radius=5, min_taken_date= '2010-01-01', max_taken_date= '2019-12-31', extras='url_c', tags = "Bulls")     
#enter in photos to the class
pc = PhotoCollector(photos)
#now that photos is in PhotoCollector, you can execute the command.
results = pc.flickr_walk()
#since the lists are returned, you can store them in these lists for the Photo class.
ph_id_list = results[0]
per_id_list = results[1]

#bring the lists into Photo class        
p = Photo(ph_id_list, per_id_list)
#exectute the attribute function in Photo class
unitedcenter = p.attributes()
Total points found:  390

Collection of Flickr photos has been completed.

plotMap function takes the data frame from the given Flickr points and the stadium/arena location.

In [12]:
import descartes #descartes needed in order to create the map
import geopandas as gpd
import matplotlib.patches as patches

def plotMap(data, lat, long):
    frame = pandas.DataFrame({'Latitude': [lat],'Longitude':[long]})
    loc = gpd.GeoDataFrame(frame, geometry=gpd.points_from_xy(frame.Longitude, frame.Latitude))
    points = gpd.GeoDataFrame(data, geometry=gpd.points_from_xy(data.Longitude, data.Latitude))
    cookcounty = gpd.read_file("shp/Cook_County.shp")
    chi_boundary = gpd.read_file("shp/Boundaries - City.geojson") 
    loc.crs = {'init' :'epsg:26791'}
    points.crs = {'init' :'epsg:26791'}
    cookcounty.crs = {'init' :'epsg:26791'}
    chi_boundary.crs = {'init' :'epsg:26791'}
    base = chi_boundary.plot(color = 'black', edgecolor = 'white', figsize=(15, 15))
    base.set_facecolor('black')
    palette = {'Collected Points':'#7b3294', 'Location of Interest':'#008837'}
    list_of_items = ['Collected Points','Location of Interest']
    items_list =[]
    for i in list_of_items:
        label = i
        color = palette[i]
        items_list.append(patches.Patch(facecolor=color, label = label,alpha=0.9))
    points.plot(ax=base, marker='o', color = '#7b3294', markersize = 2)
    loc.plot(ax=base, marker='o', color = '#008837', markersize = 25)
    base.legend(handles = items_list, fontsize=15)
In [13]:
plotMap(unitedcenter, 41.8806908, -87.6763646)
C:\Users\kwmcnair\AppData\Local\Continuum\anaconda3\lib\site-packages\pyproj\crs\crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))

Chicago Cubs at Wrigley Field

In [14]:
photos = flickr.walk(lat=41.948164, lon=-87.655798, radius=5, min_taken_date= '2010-01-01', max_taken_date= '2019-12-31', extras='url_c', tags = "Cubs")     
pc = PhotoCollector(photos)
results = pc.flickr_walk()
ph_id_list = results[0]
per_id_list = results[1]
        
p = Photo(ph_id_list, per_id_list)
wrigleyfield = p.attributes()
Total points found:  499

Collection of Flickr photos has been completed.
In [15]:
plotMap(wrigleyfield, 41.948164, -87.655798)
C:\Users\kwmcnair\AppData\Local\Continuum\anaconda3\lib\site-packages\pyproj\crs\crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))

Chicago Bears at Soldier Field

In [17]:
photos = flickr.walk(lat=41.862188, lon=-87.616690, radius=5, min_taken_date= '2010-01-01', max_taken_date= '2019-12-31', extras='url_c', tags = "Bears")     
pc = PhotoCollector(photos)
results = pc.flickr_walk()
ph_id_list = results[0]
per_id_list = results[1]
        
p = Photo(ph_id_list, per_id_list)
soldierfield = p.attributes()
Total points found:  480

Collection of Flickr photos has been completed.
In [18]:
plotMap(soldierfield, 41.862188, -87.616690)
C:\Users\kwmcnair\AppData\Local\Continuum\anaconda3\lib\site-packages\pyproj\crs\crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))

White Sox at Guaranteed Rate Field

In [19]:
photos = flickr.walk(lat=41.830509, lon=-87.6335052, radius=5, min_taken_date= '2010-01-01', max_taken_date= '2019-12-31', extras='url_c', tags = "Sox")     
pc = PhotoCollector(photos)
results = pc.flickr_walk()
ph_id_list = results[0]
per_id_list = results[1]
        
p = Photo(ph_id_list, per_id_list)
g_rates = p.attributes()
Total points found:  342

Collection of Flickr photos has been completed.
In [20]:
plotMap(g_rates, 41.830509, -87.6335052)
C:\Users\kwmcnair\AppData\Local\Continuum\anaconda3\lib\site-packages\pyproj\crs\crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))

Chicago Blackhawks at the United Center

In [21]:
photos = flickr.walk(lat=41.8806908, lon=-87.6763646, radius=5, min_taken_date= '2010-01-01', max_taken_date= '2019-12-31', extras='url_c', tags = "Blackhawks")     
pc = PhotoCollector(photos)
results = pc.flickr_walk()
ph_id_list = results[0]
per_id_list = results[1]
        
p = Photo(ph_id_list, per_id_list)
uc_hawks = p.attributes()
Total points found:  490

Collection of Flickr photos has been completed.
In [22]:
plotMap(uc_hawks, 41.8806908, -87.6763646)
C:\Users\kwmcnair\AppData\Local\Continuum\anaconda3\lib\site-packages\pyproj\crs\crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))

Combine all the data together to map the distribution across Chicago

In [23]:
unitedcenter['Team'] = 'Bulls'
wrigleyfield['Team'] = 'Cubs'
soldierfield['Team'] = 'Bears'
g_rates['Team'] = 'White Sox'
uc_hawks['Team'] = 'Blackhawks'
frames = [unitedcenter, wrigleyfield, soldierfield, g_rates, uc_hawks]
combined = pandas.concat(frames)
combined = gpd.GeoDataFrame(combined, geometry=gpd.points_from_xy(combined.Longitude, combined.Latitude))
In [25]:
combined.crs = {'init' :'epsg:26791'}
chi_boundary = gpd.read_file("shp/Boundaries - City.geojson")
chi_boundary.crs = {'init' :'epsg:26791'}
base = chi_boundary.plot(color = 'black', edgecolor = 'white', figsize=(18, 18)) #make cook county black and the base of the map/plot
base.set_facecolor('black')
pointsPalette = {'Bulls':'#a6cee3', 'Cubs':'#1f78b4','Bears':'#b2df8a','White Sox':'#33a02c','Blackhawks':'#fb9a99'}
list_of_teams = ['Bulls','Cubs','Bears','White Sox','Blackhawks']
team_list =[]
for t in list_of_teams:
    label = t
    color = pointsPalette[t]
    team_list.append(patches.Patch(facecolor=color, label = label,alpha=0.9))
for team, data in combined.groupby('Team'):
    color = pointsPalette[team]
    label = team
    data.plot(color = color, ax = base, label = label, markersize = 5)
base.legend(handles = team_list, fontsize=15)
Out[25]:
<matplotlib.legend.Legend at 0x1b5ed3fdd88>

Out of my own curiosity, I wanted to know if this would show a similar cluster when using sklearn for k means clustering

In [26]:
from sklearn import datasets
from sklearn.cluster import KMeans
import random
import numpy as np
from matplotlib import pyplot as plt

xy_list = []
for index,row in combined.iterrows(): 
    coordinates = [] #coordinates dictionary
    lat = (float(row["Latitude"])) #finds the latitude in the json
    long = (float(row["Longitude"])) #finds the longitude in the json
    coordinates.append(long)
    coordinates.append(lat) #both lat and long are added into the coordinates list during each iteration.
    xytuple = (long,lat)
    xy_list.append(xytuple)

attributes = pandas.DataFrame(xy_list)
attributes.columns = ['X','Y']
model = KMeans(n_clusters=5, max_iter = 500)
model.fit(attributes)

print(model.labels_)

plt.figure(figsize=(18,18))
colormap = np.array(['#a6cee3', '#1f78b4', '#b2df8a','#33a02c','#fb9a99'])
plt.scatter(attributes.X, attributes.Y, c=colormap[model.labels_], s=30)
plt.title('K-Mean Clustering - 5 Clusters')
[2 4 4 ... 2 2 2]
Out[26]:
Text(0.5, 1.0, 'K-Mean Clustering - 5 Clusters')

A similar output; however, there is a lot of distribution of the flickr points of different teams within the Chicago Loop. The k means clustering divides that distribution.

In [27]:
import os

os.system('jupyter nbconvert --to html FlickrAPI.ipynb')
Out[27]:
0
In [ ]: